Introduction

The aim of this report is to outline the factors, both passenger controllable (e.g. the airline used) and uncontrollable (e.g. weather), that most influence whether a flight will arrive late along with the extent of this delay. Our goal is to allow potential passengers to make “smarter” booking decisions in order to minimise the possibility of experiencing delays, while also allowing them to better expect potential delays.

“To do this the report first outlines the details of the underlying data used. Following this, details of delays will be discussed. This will include both basic details of which airline experiences the most delays but will also detail of how severe these delays can be and how they vary by airline. Finally, the delay data relating to airlines is combined with other factors to give a more holistic view of the potential for passengers to face delays. Once identified these factors are combined and used to construct models to predict the potential delays passengers may face.” - edit

As the report’s title suggests, the data being considered relates to domestic US flights originating from New York City’s 3 airports (Newark Liberty International, John F Kennedy International and La Guardia) in 2013. This data is contained in the R ‘nycflights13’ package which contains 5 separate datasets. Details of these datasets can be found in Appendix A. Below we can see a quick overview of where these flights are going.

Maps

# Create edges for network graph

edges <- flights %>%
  filter(code=="N") %>%
  group_by(origin,dest) %>%
  summarise(weight = n())
## `summarise()` regrouping output by 'origin' (override with `.groups` argument)
# Add colour by origin for plot

edges$colour <- as.factor(edges$origin)

# Create graph

g <- graph.data.frame(edges, directed = T)

# Subset graph for number of flights per origin airport

n1 <- induced_subgraph(g, V(g)[-1:-2])
n2 <- induced_subgraph(g, V(g)[-2:-3])
n3 <- induced_subgraph(g, V(g)[c(-1,-3)])

# Add number of flights to each vertex

V(g)$weights <- graph.strength(g)
V(g)[-1:-2]$weights1 <- graph.strength(n1)
V(g)[1:3]$weights1 <- 0
V(g)[-2:-3]$weights2 <- graph.strength(n2)
V(g)[1:3]$weights2 <- 0
V(g)[c(-1,-3)]$weights3 <- graph.strength(n3)
V(g)[1:3]$weights3 <- 0
V(g)$sizes <- V(g)$weights
V(g)$sizes[1:3] <- mean(V(g)$sizes)

# Link encoded names to actual names

vertices <- data.frame(vname=V(g)$name) %>%
  left_join(airports, by=c("vname" = "faa"))

# Create hover text info

V(g)$text <- paste(vertices$name,
                     paste("Flights from LGA: ",format(V(g)$weights1, big.mark=",", trim=T)),
                     paste("Flights from EWR: ",format(V(g)$weights2, big.mark=",", trim=T)),
                     paste("Flights from JFK: ",format(V(g)$weights3, big.mark=",", trim=T)),
                     paste("Total Flights: ",format(V(g)$weights, big.mark=",", trim=T)),
                     sep = "<br>")

# Add geographical co-ordinates to vertices

V(g)$lat <- vertices$lat
V(g)$lon <- vertices$lon

# Link edges to vertices

ends <- data.frame(ename=get.edgelist(g)[,2]) %>% left_join(airports, by=c("ename" = "faa"))
E(g)$elat <- ends$lat
E(g)$elon <- ends$lon

# Create network

net <- ggnetwork(g)

# Get map data for background
library(rgeos)
## Loading required package: sp
## rgeos version: 0.5-3, (SVN revision 634)
##  GEOS runtime version: 3.8.0-CAPI-1.13.1 
##  Linking to sp version: 1.4-2 
##  Polygon checking: TRUE
world <- ne_countries(country="United States of America", returnclass = "sf")
states <- st_as_sf(maps::map("state", plot = FALSE, fill = TRUE))

# Plot network graph

plot <- ggplot(data=net) + geom_sf(data=world, size=0.3, fill = "antiquewhite1") + 
  geom_sf(data = states, size=0.1, fill = "NA") +
  geom_edges(mapping=aes(x = lon, y = lat, xend = elon, yend = elat, color=colour), size=0.6, alpha=0.3) +
  geom_nodes(mapping=aes(x = lon, y = lat, text=text),size=0.01, alpha = 0.6) +
  geom_text(mapping=aes(x = lon, y = lat, label="&#128743;", size=sizes)) + 
  scale_radius(range = c(3,6)) +
  guides(size=F) +
  theme_blank() +
  labs(colour = "Airport") +
  theme(panel.background = element_rect(fill = "aliceblue")) +
  labs(title="Network Map of US Flights from New York in 2013", subtitle = "(Hover for airport names and flight numbers)")

# Add interactivity

plot %>% ggplotly(tooltip="text", width=900, height=500) %>% layout(
                    title = list(text = paste0("Network Map of US Flights from New York in 2013","<br>",
                                               "<sup>","(Hover for airport names and flight numbers)","</sup>")),
                    xaxis = list(range = c(-125, -65)),
                    yaxis = list(range = c(24, 50)))
flights_1 <- flights %>% filter(code=="N")
nflights <- flights_1 %>% group_by(dest) %>% summarise(n = n(),d=mean(arr_delay)) %>% left_join(airports, by = c("dest"="faa"))
## `summarise()` ungrouping output (override with `.groups` argument)
g <- list(
  scope = 'usa',
  showland = TRUE,
  landcolor = toRGB("gray95"),
  subunitcolor = toRGB("gray85"),
  countrycolor = toRGB("gray85"),
  countrywidth = 0.5,
  subunitwidth = 0.5
)

fig <- plot_geo(nflights, lat = ~lat, lon = ~lon)
fig <- fig %>% add_markers(
  text = ~paste(name, dest, paste("Number of flights:", n), paste("Average delay:", d), sep = "<br />"),
  color = ~d, symbol = I("circle"), colors=viridis_pal(direction = -1,option="B")(104), size = ~n, hoverinfo = "text"
)

fig <- fig %>% colorbar(title = "Average Delay")
fig <- fig %>% layout(title = 'Airports<br />(Hover for airport)', geo = g)
fig
fly_filt <- flights%>%
  filter(code=="N")%>%
    select(year, month, day, sched_dep_time, dep_time, dep_delay, arr_time, arr_delay, carrier, origin, dest, distance)


ap_filtered <- airports%>%
  select(faa, name, lat, lon)%>%
    rename(origin = faa)

fly_filt <- inner_join(fly_filt, ap_filtered)%>%
  rename(origin_name = name, origin_lat = lat, origin_lon = lon)%>%
    select(-origin)
## Joining, by = "origin"
ap_filtered <- airports%>%
  select(faa, name, lat, lon)%>%
  rename(dest = faa)

fly_filt <- inner_join(fly_filt, ap_filtered)%>%
  rename(dest_name = name, dest_lat = lat, dest_lon = lon)%>%
    select(-dest)
## Joining, by = "dest"
#The non-US airports (the ones from overseas dependencies) show as NA so these are being removed
fly_filt <- fly_filt%>%
  filter(!is.na(dest_name))

# Create plot data and calcuate distances
dest_det <- st_as_sf(fly_filt, coords = c('dest_lon', 'dest_lat'), crs = 4326)%>%
    group_by(dest_name)%>%
      summarise(count = n())%>%
        select(dest_name, count, geometry)%>%
          rename(name = dest_name)
## `summarise()` ungrouping output (override with `.groups` argument)
fly_filt <- fly_filt%>%
    mutate(time_lost = arr_delay - dep_delay)%>%
        select(year, month, day, sched_dep_time, dep_time, dep_delay, arr_time, arr_delay, time_lost, carrier, origin_name, dest_name, distance)

#Categorisethe times fo the days to reduce analytic volume
#No flights between midnight and 5am 
fly_filt <- fly_filt%>%
  mutate(Intervals = if_else(sched_dep_time < 900, "Early_Morn",
                             if_else(sched_dep_time > 859 & sched_dep_time < 1200, "Late_Morn",
                                     if_else(sched_dep_time > 1159 & sched_dep_time < 1500, "Early_Aft",
                                             if_else(sched_dep_time > 1459 & sched_dep_time < 1800, "Late_Aft",
                                                     if_else(sched_dep_time > 1759 & sched_dep_time < 2100, "Early_Eve",
                                                             "Late_Eve")
                                                     )
                                             )
                                     )
                             )
  )
sts_con <- us_states %>%
              select(name = NAME)

sts_als <- alaska%>%
    select(name = NAME)%>%
      mutate("count" = 0)

sts_haw <- hawaii%>%
    select(name = NAME)%>%
      mutate("count" = 0)

dest_us <- dest_det%>% 
  filter(!(name %in% c("Honolulu Intl", "Ted Stevens Anchorage Intl")))%>%
  st_transform(crs = st_crs(us_states))

dest_a <- dest_det%>% 
  filter(name == "Ted Stevens Anchorage Intl")%>%
  st_transform(crs = st_crs(alaska))

dest_h <- dest_det%>% 
  filter(name == "Honolulu Intl")%>%
  st_transform(crs = st_crs(hawaii))

brks_seq <- seq(1, max(dest_us$count, by = 250))
sz <- .25

US_Map <- 
  tm_shape(sts_con) +
    tm_polygons()+
      tm_layout(frame = FALSE)+
  tm_shape(dest_us) +
    tm_dots(size = sz, col = "count", legend.show = FALSE, breaks = brks_seq)+
        tm_layout(frame = FALSE)


  tm_shape(sts_con) +
    tm_polygons()+
      tm_layout(frame = FALSE)+
  tm_shape(dest_us) +
    tm_bubbles("count", breaks = brks_seq)+
        tm_layout(frame = FALSE)

H_map <- 
  tm_shape(sts_haw) +
    tm_polygons()+
      tm_layout(frame = FALSE)+
  tm_shape(dest_h) +
    tm_dots(size = sz, col = "count", legend.show = FALSE, breaks = brks_seq)+
      tm_layout(frame = FALSE)

A_Map <-
  tm_shape(sts_als) +
    tm_polygons()+
      tm_layout(frame = FALSE)+
  tm_shape(dest_a) +
    tm_dots(size = sz, col = "count", legend.show = FALSE, breaks = brks_seq)+
      tm_layout(frame = FALSE)

US_Map
print(H_map, vp = grid::viewport(0.45, 0.1, width = 0.2, height = 0.1))
print(A_Map, vp = grid::viewport(0.15, 0.15, width = 0.3, height = 0.3))

Proportion of airlines plots:

flights_1 <- flights %>%
  filter(code=="N") %>%
  left_join(airports, by=c("origin"="faa")) %>%
  left_join(airlines, by="carrier") %>%
  select(c("name.x","name.y"))
names(flights_1)<-c("origin_airport","carrier_name")
marketshare <- flights_1 %>% group_by(carrier_name) %>% summarise(count=n())
## `summarise()` ungrouping output (override with `.groups` argument)
flights_1$carrier_name <- factor(flights_1$carrier_name, levels = marketshare$carrier_name[order(-marketshare$count)]) 
g <- ggplot(flights_1,aes(x=carrier_name,fill=origin_airport))+
  geom_bar()+
  ylab("Count of Flight")+
  xlab("Airline")+
  theme( axis.text.x = element_text(angle = 45))
ggplotly(g)

Click to zoom in and see distribution per origin airport. Click title to zoom back out.

d3tree2(g  ,rootname = "Proportion of flights per airline", width = 500, height = 300)
flights_1 <- flights %>%
  filter(code=="N") %>%
  left_join(airports, by=c("origin"="faa")) %>%
  left_join(airlines, by="carrier") %>%
  select(c("name.x","name.y"))

names(flights_1)<-c("origin","carrier")

share <- flights_1 %>% 
  filter(!(carrier %in% c("Alaska Airlines Inc.", "Frontier Airlines Inc.", 
                          "Hawaiian Airlines Inc.", "Mesa Airlines Inc.", "SkyWest Airlines Inc.")))%>%
  group_by(carrier, origin) %>% 
    summarise(count=n())
## `summarise()` regrouping output by 'carrier' (override with `.groups` argument)
share <- share %>%
  mutate(perc = 100*count/sum(share$count))

wdths <- share%>%
  group_by(carrier)%>%
    summarise(wdt = sum(perc))%>%
      arrange(wdt)%>%
        mutate(pos = cumsum(wdt)-wdt/2)
## `summarise()` ungrouping output (override with `.groups` argument)
share <- full_join(share, wdths, by = "carrier")
     
ggplot(data = share)+
  geom_bar(aes(fill = origin, x=pos, y = perc, width = wdt), colour = 'white', position = 'fill', stat = 'identity')+ 
  scale_x_continuous(label = share$carrier, breaks = share$pos)+
  scale_fill_manual(values = c('skyblue2', 'dodgerblue2', 'steelblue4'))+
  coord_flip()+
  theme(legend.position = 'bottom')+
  labs(x='Airline', y = 'Proportion in given airport')

Flights delay by dest

# Convert day perids to factors
fly_filt$Intervals <- factor(fly_filt$Intervals)

fly_filt <- fly_filt%>%
  mutate(dist_grp = cut_width(distance, 1000, center = 500))

levels(fly_filt$Intervals) <- c("Early_Morn", "Late_Morn", "Early_Aft", "Late_Aft", "Early_Eve", "Late_Eve")


p <- fly_filt %>% filter(arr_delay>5)%>%
  nrow()/nrow(fly_filt)

paste("Percentage of flights that are late", sprintf("%.2f %%", 100*p))
## [1] "Percentage of flights that are late 33.55 %"
p <- fly_filt %>% filter(arr_delay>30)%>%
  nrow()/nrow(fly_filt)

paste("Percentage of flights more than half an hour late", sprintf("%.2f %%", 100*p))
## [1] "Percentage of flights more than half an hour late 15.73 %"
p <- fly_filt %>% filter(arr_delay>60)%>%
  nrow()/nrow(fly_filt)

paste("Percentage of flights more than an hour late", sprintf("%.2f %%", 100*p))
## [1] "Percentage of flights more than an hour late 8.49 %"
p <- fly_filt %>% filter(arr_delay>120)%>%
  nrow()/nrow(fly_filt)

paste("Percentage of flights more than 2 hours late", sprintf("%.2f %%", 100*p))
## [1] "Percentage of flights more than 2 hours late 3.07 %"
p <- fly_filt%>%
  filter(arr_delay > -sort(-fly_filt$arr_delay)[21])%>%
  mutate(cal_date = dmy(paste(day, month, year, sep = ' ')))%>%
    select(cal_date, dep_time, dep_delay, arr_time, arr_delay, carrier, origin_name, dest_name, distance)

print(kable(p, caption = "Most delayed flights", format = "markdown"))
## 
## 
## Table: Most delayed flights
## 
## |cal_date   | dep_time| dep_delay| arr_time| arr_delay|carrier |origin_name         |dest_name                         | distance|
## |:----------|--------:|---------:|--------:|---------:|:-------|:-------------------|:---------------------------------|--------:|
## |2013-01-01 |      848|       853|     1001|       851|MQ      |John F Kennedy Intl |Baltimore Washington Intl         |      184|
## |2013-01-09 |      641|      1301|     1242|      1272|HA      |John F Kennedy Intl |Honolulu Intl                     |     4983|
## |2013-01-10 |     1121|      1126|     1239|      1109|MQ      |Newark Liberty Intl |Chicago Ohare Intl                |      719|
## |2013-11-03 |      603|       798|      829|       796|DL      |Newark Liberty Intl |Hartsfield Jackson Atlanta Intl   |      746|
## |2013-12-05 |      756|       896|     1058|       878|AA      |Newark Liberty Intl |Miami Intl                        |     1085|
## |2013-12-14 |      830|       825|     1210|       856|DL      |John F Kennedy Intl |Tampa Intl                        |     1005|
## |2013-12-17 |      705|       845|     1026|       846|AA      |Newark Liberty Intl |Miami Intl                        |     1085|
## |2013-12-19 |      734|       849|     1046|       847|DL      |Newark Liberty Intl |Salt Lake City Intl               |     1969|
## |2013-02-10 |     2243|       853|      100|       834|F9      |La Guardia          |Denver Intl                       |     1620|
## |2013-03-17 |     2321|       911|      135|       915|DL      |La Guardia          |Minneapolis St Paul Intl          |     1020|
## |2013-04-10 |     1100|       960|     1342|       931|DL      |John F Kennedy Intl |Tampa Intl                        |     1005|
## |2013-04-19 |      912|       812|     1228|       821|DL      |La Guardia          |Tampa Intl                        |     1010|
## |2013-05-03 |     1133|       878|     1250|       875|MQ      |Newark Liberty Intl |Chicago Ohare Intl                |      719|
## |2013-05-19 |      713|       853|     1007|       852|AA      |John F Kennedy Intl |Mc Carran Intl                    |     2248|
## |2013-06-15 |     1432|      1137|     1607|      1127|MQ      |John F Kennedy Intl |Port Columbus Intl                |      483|
## |2013-06-27 |      753|       803|      937|       802|AA      |La Guardia          |Lambert St Louis Intl             |      888|
## |2013-06-27 |      959|       899|     1236|       850|DL      |John F Kennedy Intl |Portland Intl                     |     2454|
## |2013-07-22 |      845|      1005|     1044|       989|MQ      |John F Kennedy Intl |Cincinnati Northern Kentucky Intl |      589|
## |2013-07-22 |     2257|       898|      121|       895|DL      |La Guardia          |Hartsfield Jackson Atlanta Intl   |      762|
## |2013-09-20 |     1139|      1014|     1457|      1007|AA      |John F Kennedy Intl |San Francisco Intl                |     2586|
fly_filt %>% filter(arr_delay>5)%>%
  group_by(dest_name)%>%
  summarise(c = n())%>%
    arrange(-c)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 103 x 2
##    dest_name                              c
##    <chr>                              <int>
##  1 Hartsfield Jackson Atlanta Intl     6417
##  2 Chicago Ohare Intl                  5219
##  3 Los Angeles Intl                    4804
##  4 Charlotte Douglas Intl              4711
##  5 Orlando Intl                        4520
##  6 Fort Lauderdale Hollywood Intl      4356
##  7 San Francisco Intl                  4110
##  8 General Edward Lawrence Logan Intl  3814
##  9 Ronald Reagan Washington Natl       3252
## 10 Miami Intl                          3152
## # ... with 93 more rows

Flights delay/early per carrier

# Remove cancelled/diverted flights and merge airports and flights datasets
plot_a <- flights%>%
  filter(code=="N")%>%
  full_join(airlines, by="carrier")%>%
  mutate(type = "On Time")

plot_a$arr_delay <- ifelse(is.na(plot_a$arr_delay), 
                       (60*floor(plot_a$arr_time/100) + plot_a$arr_time%%100 - 
                          (60*floor(plot_a$sched_arr_time/100) + plot_a$sched_arr_time%%100))%%1440,
                        plot_a$arr_delay)

plot_a$type <- ifelse(plot_a$arr_delay > 10, "Late, within 30 minutes", plot_a$type)
plot_a$type <- ifelse(plot_a$arr_delay > 30, "Late, between 30 min and 1 hour", plot_a$type)
plot_a$type <- ifelse(plot_a$arr_delay > 60, "Late, between 1 and 2 hours", plot_a$type)
plot_a$type <- ifelse(plot_a$arr_delay > 120, "Late, over 2 hours", plot_a$type)
plot_a$type <- ifelse(plot_a$arr_delay < -10, "Ahead of Schedule", plot_a$type)
plot_a$type <- ifelse(is.na(plot_a$arr_time), "Diverted", plot_a$type)
plot_a$type <- ifelse(is.na(plot_a$dep_time), "Cancelled", plot_a$type)



plot_a <- plot_a%>%
  group_by(name, type)%>%
    summarise(count = n())
## `summarise()` regrouping output by 'name' (override with `.groups` argument)
plot_a1 <- plot_a%>%
  group_by(name)%>%
    summarise(tot = sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
plot_a2 <- plot_a%>%
  group_by(type)%>%
    summarise(tot = sum(count))%>%
      mutate(perc = tot/sum(tot))
## `summarise()` ungrouping output (override with `.groups` argument)
plot_a <- full_join(plot_a, plot_a1, by="name")%>%
  mutate(perc = count/tot)%>%
  select(name, type, perc)

p1_avg <- sum(filter(plot_a2, type %in% c("On Time", "Ahead of Schedule"))$perc)
p2_avg <- sum(filter(plot_a2, type %in% c("Late, over 2 hours", "Late, between 1 and 2 hours"))$perc)

p1 <- plot_a%>%
  filter(type %in% c("On Time", "Ahead of Schedule"))%>%
    group_by(name)%>%
      summarise(perc = sum(perc))%>%
        arrange(perc)
## `summarise()` ungrouping output (override with `.groups` argument)
p1$name <- factor(p1$name, p1$name)

p1_p <- ggplot()+
  geom_bar(data = p1, aes(y = perc*100, x= name), fill = ifelse(p1$perc > p1_avg, "green4", "red2"), 
            width = 0.5, stat = 'identity')+
  geom_hline(aes(yintercept = p1_avg*100))+
  labs(y = 'Total Flights (%)', x = 'Airline', title = 'On time, or early')+
  theme(text = element_text(size=9))+
  coord_flip(ylim = c(50, 85), )


p2 <- plot_a%>%
  filter(type %in% c("Late, over 2 hours", "Late, between 1 and 2 hours"))%>%
    group_by(name)%>%
      summarise(perc = sum(perc))%>%
        arrange(-perc)
## `summarise()` ungrouping output (override with `.groups` argument)
p2$name <- factor(p2$name, p2$name)

p2_p <- ggplot()+
  geom_bar(data = p2, aes(y = perc*100, x= name), fill = ifelse(p2$perc < p2_avg, "green4", "red2"), 
            width = 0.5, stat = 'identity')+
  geom_hline(aes(yintercept = p2_avg*100))+
  coord_flip()+
  labs(y = 'Total Flights (%)', x = 'Airline', title = 'Delayed by over 1 hour')+
              theme(text = element_text(size=9))


grid.arrange(p1_p, p2_p, ncol= 2,
            top ='Comparison of on time and heavily delayed flights')

plot_a$type <- factor(plot_a$type, levels = c("Cancelled",
                                              "Diverted",
                                              "Late, over 2 hours", 
                                              "Late, between 1 and 2 hours",
                                              "Late, between 30 min and 1 hour", 
                                              "Late, within 30 minutes", 
                                              "On Time",
                                              "Ahead of Schedule"))
datafly <- flights %>% filter(code=="N")
ggdata <- datafly %>% group_by(carrier) %>% summarise(avg_mean = mean(arr_delay),num = n()) %>% arrange(avg_mean) %>% left_join(airlines,by="carrier")
## `summarise()` ungrouping output (override with `.groups` argument)
ggdata_new = mutate(ggdata, col = paste(carrier , "- (" , round(avg_mean,2),")"))
top_1 <- (slice(ggdata_new,1))$name
title <- paste("Airline ",top_1," is more on time than others")
ggdata_new %>%
  ggplot( aes(x=num, y=avg_mean, group=carrier, color=carrier)) +
  geom_label(aes(label = col),nudge_x = 7,nudge_y = 2) + ggplot2::geom_point() +
  scale_color_viridis(discrete = TRUE) +
  theme(
    legend.position="none",
    plot.title = element_text(size=12)
  ) +  coord_cartesian(xlim = c(-3000,70000),ylim=c(-10,30)) +
  ggtitle(title) 

analysis of flights by day of week/hour/month

ssdata <- head(arrange(datafly,time_hour) %>% 
                filter(dep_delay <= 0) %>% 
                group_by(time_hour) %>%
                count(weekdays(as.POSIXct(time_hour))) %>% 
                arrange(desc(n)),200) %>% 
                separate(time_hour,c("Date","Time"),sep=" ") %>% 
                separate(Time,c("Hr","mm","ss"),sep=":")



g <- ggplot(ssdata, aes(x=`weekdays(as.POSIXct(time_hour))`, y=Hr,color=n,size=n)) + geom_point(stat="identity") + transition_time(as.integer(rownames(ssdata))) +  geom_text(aes(label=n),nudge_y = 0.25) + ease_aes('linear',interval=12)+ theme_bw() + labs(x="Weekdays",y="hour")

animate(g,duration=400,width=400,height=400)

ssdata <- arrange(datafly,time_hour) %>% 
                group_by(weekdays(time_hour),hour(time_hour)) %>%
                summarise(n_total=n(), n_early=sum(dep_delay<=0)) %>%
                mutate(p_early = n_early/n_total)
## `summarise()` regrouping output by 'weekdays(time_hour)' (override with `.groups` argument)
names(ssdata)[1:2] <- c("Weekday","Hour")


names(ssdata)[1:2] <- c("Weekday","Hour")
ssdata$Weekday <- factor(ssdata$Weekday, levels=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))

g <- ggplot(ssdata, aes(x=Weekday, y=Hour,fill=p_early)) + geom_tile()
g

noofinst <- arrange(datafly,time_hour) %>% group_by(time_hour) %>% 
    group_by(origin) %>% select( flight,carrier,origin,month,time_hour,sched_dep_time,dep_delay) %>% arrange(desc(origin)) %>% arrange(sched_dep_time) %>% group_by(origin,time_hour) %>% select(carrier) %>% count(time_hour)  %>% separate(time_hour,c("Date","Time"),sep=" ") %>% 
separate(Time,c("Hr","mm","ss"),sep=":")
## Adding missing grouping variables: `origin`, `time_hour`
h <- ggplot(noofinst,aes(x=Hr,y=n,group=origin,color=origin))+geom_point()+geom_line() + transition_reveal(as.integer(rownames(noofinst))) + theme_bw() + labs(x="Time",y="Noofflight")

animate(h,width=400,height=400,start_pause=20,end_pause = 20)

#Checking the number flight per month per origin

origin_num <- flights %>% 
   filter(code=="N") %>%
   group_by(month, origin) %>%
   summarise(count = n())
## `summarise()` regrouping output by 'month' (override with `.groups` argument)
#Checking the number of departures per month for each origins.

O_JFK<-origin_num %>% filter(origin =='JFK')
O_EWR<-origin_num %>% filter(origin =='EWR')
O_LGA<-origin_num %>% filter(origin =='LGA')

#Comparing the number of flights from each origin per month.

ggplot()+
  geom_line(data = O_JFK, aes(x = month, y = count), color = "blue") +
  geom_line(data = O_EWR, aes(x = month, y = count), color = "red") +
  geom_line(data = O_LGA, aes(x = month, y = count), color = "black")

We can see that EWR is operating consistantly the maximum number of flights to other destinations followed by JFK(2nd highest) and LGA (3rd).

We can see a different trend during the montn of August to September where the flights departed from EWR aand JFK has been reduced but for LGA the flights departed to other destinations has been slightly increased.

load(“St662 Project imputed.RData”)

# ------------------ Plot 1 -------------------------


delay_per_month <- flights %>% filter(code=="N") %>% group_by(month) %>% summarise(avg_delay = mean(dep_delay))
## `summarise()` ungrouping output (override with `.groups` argument)
g1 <- ggplot(delay_per_month, aes(x=month, y= avg_delay)) +
  geom_segment( aes(x=month, xend=month, y=0, yend=avg_delay), color = "brown") +
  geom_point( color="darkblue", size=4, alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank())+
   theme(plot.margin = unit(c(0.7,0.7,0.7,0.7), "cm"))+
  scale_x_discrete(labels = month.abb[c(1:12)])+
  xlab("Month")+ ylab("Average Departure Delay (in minutes)")
# ------------------ Plot 2 -------------------------


delay_per_hour <- flights %>% filter(code=="N") %>% group_by(hour) %>% summarise(avg_delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
g2 <- ggplot(delay_per_hour, aes(x=hour, y= avg_delay)) +
  geom_segment( aes(x=hour, xend=hour, y=0, yend=avg_delay), color = "brown") +
  geom_line()+
  geom_point( color="darkblue", size=4, alpha=0.6) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank())+
   theme(plot.margin = unit(c(0.7,0.7,0.7,0.7), "cm"))+
  coord_flip()+
  scale_x_continuous(name="Hour of the day", breaks = seq(0, 23, by = 1))+
  scale_y_continuous(name="Average Departure Delay (in minutes)", breaks = seq(0, 25, by = 5))
grid.arrange(g1, g2, ncol=2, nrow = 1, top = textGrob("Departure Delay Statistics",
                                                      gp=gpar(fontsize=18,font=3, col ="darkblue")))

# ------------------ Plot 3 -------------------------


delay_per_day <- flights_model %>% group_by(day) %>% summarise(avg_delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
g3 <- ggplot(delay_per_day, aes(x=day, y= avg_delay)) +
  # geom_segment( aes(x=day, xend=day, y=0, yend=avg_delay), color = " brown")
  geom_line()+
  geom_point( color="skyblue", size=4, alpha=0.6) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank())+
   theme(plot.margin = unit(c(0.7,0.7,0.7,0.7), "cm"))+
  # coord_flip()+
  scale_x_continuous(name="Day", breaks = seq(1, 31, by = 1))+
  scale_y_continuous(name="Average Departure Delay (in minutes)", breaks = seq(0, 30, by = 5))

g3

data <- flights %>% filter(code=="N")  %>%
  mutate(q = quarter(time_hour))
data$q = as.factor(data$q)
d1 <- data %>% group_by(q) %>% summarise(avg_delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
d1 %>%  
  ggplot( aes(x=q, y=avg_delay, fill=q)) +
  geom_col(width = 0.7) +
  scale_fill_viridis(discrete = TRUE, alpha=0.6) +
  theme_light()+
  theme(
    axis.text=element_text(size=12),
    axis.title=element_text(size=14,face="bold"),
    plot.title = element_text(size=17, colour = "darkblue"),
    legend.position="none",
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  ) + 
  coord_flip()+
  ggtitle("Departure Delay For Each Quarter of 2013") +
  xlab("Quarter") + ylab("Average Departure Delay (in minutes)")

Plots for weather

ggcorr(weather[, c(7:11,13:15)], method = c("everything", "pearson"), nbreaks = 4, 
       palette = "RdGy", label = TRUE, label_size = 4, label_round = 2,
       label_color="white", legend.size = 12) 

#Checking delayed and non-delayed flights. 
delayed <- filter(flights,dep_delay>0,code=="N")
not_delayed <- filter(flights,dep_delay<=0,code=="N")

#Checking delays by origin and time hour.
delay_th_origin <- group_by(delayed,origin,time_hour)

#Checking counts of total delays in each time hour. 
add_delaycount<- summarize(delay_th_origin,tot_delay = mean(dep_delay),
count = n())
## `summarise()` regrouping output by 'origin' (override with `.groups` argument)
#Merging weather data by origin and time hour.
join_weather <- merge(add_delaycount, weather,by=c("origin","time_hour"))

#Grouping by wind_speed to see trends between delays and the weather variables
mer_ws <- group_by(join_weather,wind_speed)

avgdelay_ws <- summarize(mer_ws,avg_dep_delay_t = mean(tot_delay))
## `summarise()` ungrouping output (override with `.groups` argument)
ws_delay<-ggplot(avgdelay_ws, aes(x = wind_speed, y = avg_dep_delay_t)) + geom_point() + geom_smooth(method = "lm") +labs(x = "Wind Speed in mph", y="Avg. Departure delay in minutes", title = "Wind Speed vs. Avg. departure delay")

ggplotly(ws_delay)
## `geom_smooth()` using formula 'y ~ x'

From the above we can say that there is linear relation between increase in wind speed and departure delay. There are few outliars can be observed as well.

#delays due to visibility
mer_viz <- group_by(join_weather,visib)

#Checking counts of total delays in each time hour due to visibility
viz_delay <- summarize(mer_viz,avg_dep_delay_t = mean(tot_delay))
## `summarise()` ungrouping output (override with `.groups` argument)
#Checking avg. dep_delay count per visibility
num_of_delay_per_viz = summarize(mer_viz,Avg_Delay_Count_Per_Viz = mean(count))
## `summarise()` ungrouping output (override with `.groups` argument)
visibilty_delay <- ggplot(viz_delay, aes(x = visib, y = avg_dep_delay_t, color=avg_dep_delay_t)) + geom_point() + geom_smooth(method = "lm") +labs(x = "Visibility in miles",
y="Average Departure Delay in minutes",
title = "Visibility vs. Average Departure Delay")

ggplotly(visibilty_delay)
## `geom_smooth()` using formula 'y ~ x'
weather_1 <- weather
weather_1$month <- as.factor(weather_1$month)
weather_1$day <- as.factor(weather_1$day)
weather_1$hour <- as.factor(weather_1$hour)


delayed <- filter(flights,dep_delay>0, code=="N")
not_delayed <- filter(flights,dep_delay<=0, code=="N")





grouped = group_by(delayed, origin, time_hour)
sum_delay_count = summarize(grouped, totaldelay = mean(dep_delay), count = n())
## `summarise()` regrouping output by 'origin' (override with `.groups` argument)
combined = merge(sum_delay_count, weather,by=c("origin","time_hour"))
#grouping by visibility 
by_visib <- combined %>% group_by(visib)

#Calculating average delay in time per visib
avg_delay_v = by_visib %>% summarize(avg_dep_delay_time = mean(totaldelay))
## `summarise()` ungrouping output (override with `.groups` argument)
#Calculating average dep_delay count per visib
number_of_delay_per_visib = by_visib %>% summarize(Avg_Delay_Count_Per_Visib = mean(count))
## `summarise()` ungrouping output (override with `.groups` argument)
p1 <- ggplot(avg_delay_v, aes(x = visib, y = avg_dep_delay_time))+
  geom_point() + 
  geom_smooth(method = "loess", color = "red") +
  labs(x = "Visibility (miles)", y="Average Departure Delay Time (minutes)",
title = "Average Departure Delay Time vs Visibility")

p1
## `geom_smooth()` using formula 'y ~ x'

p2 <- ggplot(number_of_delay_per_visib, aes(x = visib, y = Avg_Delay_Count_Per_Visib))
p2 + geom_point()+ geom_smooth(method = "loess", color = "red") +labs(x = "Visibility (miles)",
y="Average Number of Delays",
title = "Average Number of Delays vs Visibility")
## `geom_smooth()` using formula 'y ~ x'

Model for departure delay prediction

weather_1 <- weather
planes_1 <- planes
weather_1$weather_id<-nrow(weather)
planes_1$planes_id<-nrow(planes)


fl_we <- flights_model %>% left_join(weather_1[,c(1,6:16)], c("origin","time_hour"))
fl_we_pl <- fl_we %>% left_join(planes_1, by =  "tailnum")



fl_we_pl<-fl_we_pl%>%
  filter(is.na(planes_id)!=1,is.na(weather_id)!=1)

sum(is.na(fl_we_pl$planes_id))
## [1] 0
sum(is.na(fl_we_pl$weather_id))
## [1] 0
flights_model_1<-fl_we_pl%>%
  mutate(flag=ifelse(dep_delay<=0,0,ifelse(dep_delay<=15,1,ifelse(dep_delay<=45,2,3))))

table(flights_model_1$flag)
## 
##      0      1      2      3 
## 159624  48663  29183  28606
#### variables for making model

data_model<-flights_model_1%>%
  mutate(planes_age=ifelse((2013-year.y)>30,30,(2013-year.y)),
         dep_hour=as.numeric(ifelse(nchar(sched_dep_time)==4,substr(sched_dep_time,1,2),substr(sched_dep_time,1,1)))
  )%>%
  select(flag,month,dep_hour,carrier,origin,planes_age,seats,
         temp,dewp,humid,wind_dir,wind_speed,pressure,visib,precip)%>%
  filter(is.na(planes_age)!=1)


######### model trying
#### logistic

data_model_1<-data_model%>%
  mutate(flag=ifelse(flag %in% c("1", "0"),1,0))%>%
  select(flag,month,dep_hour,
         planes_age,seats, 
         temp,dewp,humid,wind_dir,wind_speed,pressure,visib,precip,
         origin)

# move: carrier ,wind_dir+
set.seed(123)
s<-sample(nrow(data_model_1),0.6*nrow(data_model_1))
data_model_train<-data_model_1[s,]
data_model_test<-data_model_1[-s,]




fit_log <- glm(flag ~month+dep_hour+                      # departure time
                 planes_age+seats+                        # plane's type
                 temp+dewp+humid+wind_speed+pressure+visib+precip+
                 origin,
               data = data_model_train, family = binomial())


summary(fit_log)
## 
## Call:
## glm(formula = flag ~ month + dep_hour + planes_age + seats + 
##     temp + dewp + humid + wind_speed + pressure + visib + precip + 
##     origin, family = binomial(), data = data_model_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7092   0.3442   0.5283   0.7220   2.0597  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.718e+01  1.047e+00 -16.405  < 2e-16 ***
## month        1.423e-02  2.057e-03   6.915 4.66e-12 ***
## dep_hour    -1.385e-01  1.467e-03 -94.397  < 2e-16 ***
## planes_age   4.206e-03  1.113e-03   3.780 0.000157 ***
## seats        1.864e-03  9.465e-05  19.698  < 2e-16 ***
## temp        -2.881e-02  3.681e-03  -7.827 4.98e-15 ***
## dewp         2.541e-02  3.934e-03   6.460 1.05e-10 ***
## humid       -2.655e-02  1.938e-03 -13.695  < 2e-16 ***
## wind_speed  -2.603e-02  1.288e-03 -20.202  < 2e-16 ***
## pressure     2.168e-02  9.909e-04  21.882  < 2e-16 ***
## visib        2.328e-02  4.151e-03   5.610 2.03e-08 ***
## precip      -1.729e+00  1.998e-01  -8.653  < 2e-16 ***
## originJFK    4.233e-01  1.577e-02  26.844  < 2e-16 ***
## originLGA    2.740e-01  1.666e-02  16.446  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 164053  on 156726  degrees of freedom
## Residual deviance: 148854  on 156713  degrees of freedom
## AIC: 148882
## 
## Number of Fisher Scoring iterations: 4
## for train # 
predictions <- predict(fit_log,data_model_train,type = "response")%>%
  round(digits = 2)


data_model_train<-data_model_train%>%
  mutate(pred_outcome =ifelse(predictions > 0.5,1,0))

sum(data_model_train$flag!=data_model_train$pred_outcome)/nrow(data_model_train)
## [1] 0.2109656
## for test # 

predictions_test <- predict(fit_log,data_model_test,type = "response")%>%
  round(digits = 2)


data_model_test<-data_model_test%>%
  mutate(pred_outcome =ifelse(predictions_test > 0.5,1,0))

table(data_model_test$flag,data_model_test$pred_outcome)
##    
##         0     1
##   0  2318 20375
##   1  1641 80151
sum(data_model_test$flag!=data_model_test$pred_outcome)/nrow(data_model_test)
## [1] 0.2107097
# total data

data_model_train$ts<-"train"
data_model_test$ts<-"test"

data_plot<-rbind(data_model_train[,c("ts","flag","pred_outcome")],data_model_test[,c("ts","flag","pred_outcome")])

data_plot<-data_plot%>%
  mutate(right=ifelse(flag==pred_outcome,"right","wrong"))

tab2<-table(data_plot$right,data_plot$ts)

cols <- c("seagreen3","steelblue4", "purple2", "sienna1","slategray3")

barplot(prop.table(tab2,2),beside=TRUE,col=cols[1:2],
        legend.text=TRUE,args.legend= (list(x=13.5,y=.7)))

Model for Arrival delay prediction

#Fitted GLM model to predict arrival delay using some important predictors from weather dataset and flights dataset which.
flights.2013 <-
  flights_model %>%
  left_join(
    weather %>% select(
      origin,
      temp,
      dewp,
      humid,
      wind_dir,
      wind_speed,
      wind_gust,
      precip,
      pressure,
      visib,
      time_hour
    ),
    by = c("origin", "time_hour")
    )%>%
  left_join(airlines, by = c("carrier" = "carrier")) %>%
  rename(Airline = name) %>%
  left_join(airports %>% select(faa, name, lat, lon, alt),
            by = c("origin" = "faa"))  %>%
  na.omit()

#Added variable quarter which defines which quarter of the year.
flights.2013$quarter<-as.character(quarter(flights.2013$time_hour))


#removing unwanted predictors.
flights.2013 <- flights.2013[, -c(1:5,7,8,10:15,17:21,23,27,28,35)]

# Create Training and Test data 
set.seed(100)
n <- nrow(flights.2013)
trainingRows <- sample(n, 0.7*n)
training <- flights.2013[trainingRows, ] # model training data
test <- flights.2013[-trainingRows, ]   # test data

#--------------------------------------------------------- 

# Check these outlier numbers, I switched to flights_model data subset
  
#Major oultiers which impacted the model
#training <- training[-c(16618, 849,19556),]

#--------------------------------------------------------


#Fitting linear model
glm.delay <-step(glm(arr_delay ~. , data = training), direction ="both")
## Start:  AIC=444988.3
## arr_delay ~ dep_delay + distance + dewp + wind_dir + wind_speed + 
##     wind_gust + visib + Airline + name + lat + lon + alt
## 
## 
## Step:  AIC=444988.3
## arr_delay ~ dep_delay + distance + dewp + wind_dir + wind_speed + 
##     wind_gust + visib + Airline + name + lat + lon
## 
## 
## Step:  AIC=444988.3
## arr_delay ~ dep_delay + distance + dewp + wind_dir + wind_speed + 
##     wind_gust + visib + Airline + name + lat
## 
## 
## Step:  AIC=444988.3
## arr_delay ~ dep_delay + distance + dewp + wind_dir + wind_speed + 
##     wind_gust + visib + Airline + name
## 
##              Df Deviance    AIC
## - wind_speed  1 15553826 444987
## <none>          15553681 444988
## - name        2 15555483 444990
## - wind_dir    1 15555737 444993
## - wind_gust   1 15556528 444996
## - dewp        1 15564696 445023
## - distance    1 15577784 445067
## - visib       1 15755959 445660
## - Airline    10 15870219 446018
## - dep_delay   1 94345162 538948
## 
## Step:  AIC=444986.8
## arr_delay ~ dep_delay + distance + dewp + wind_dir + wind_gust + 
##     visib + Airline + name
## 
##              Df Deviance    AIC
## <none>          15553826 444987
## + wind_speed  1 15553681 444988
## - name        2 15555735 444989
## - wind_dir    1 15555899 444992
## - dewp        1 15564852 445022
## - wind_gust   1 15569573 445038
## - distance    1 15577957 445066
## - visib       1 15756323 445659
## - Airline    10 15870310 446017
## - dep_delay   1 94357769 538953
summary(glm.delay)
## 
## Call:
## glm(formula = arr_delay ~ dep_delay + distance + dewp + wind_dir + 
##     wind_gust + visib + Airline + name, data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -63.509  -10.609   -1.938    8.404  156.464  
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     13.1603549  1.0500413  12.533  < 2e-16 ***
## dep_delay                        1.0065304  0.0019590 513.801  < 2e-16 ***
## distance                        -0.0012854  0.0001430  -8.991  < 2e-16 ***
## dewp                             0.0258861  0.0042592   6.078 1.23e-09 ***
## wind_dir                        -0.0025814  0.0009796  -2.635 0.008411 ** 
## wind_gust                        0.1006909  0.0138632   7.263 3.83e-13 ***
## visib                           -1.4155539  0.0543495 -26.045  < 2e-16 ***
## AirlineAmerican Airlines Inc.   -7.6012114  0.7418893 -10.246  < 2e-16 ***
## AirlineDelta Air Lines Inc.     -7.0337257  0.7280455  -9.661  < 2e-16 ***
## AirlineEndeavor Air Inc.        -7.5003199  0.8017922  -9.354  < 2e-16 ***
## AirlineEnvoy Air                 0.0019361  0.7433807   0.003 0.997922    
## AirlineExpressJet Airlines Inc. -4.7638340  0.7437730  -6.405 1.52e-10 ***
## AirlineJetBlue Airways          -2.5174547  0.7425629  -3.390 0.000699 ***
## AirlineSouthwest Airlines Co.   -8.6183342  0.7940285 -10.854  < 2e-16 ***
## AirlineUnited Air Lines Inc.    -7.8690544  0.7433501 -10.586  < 2e-16 ***
## AirlineUS Airways Inc.          -2.3554387  0.7566738  -3.113 0.001854 ** 
## AirlineVirgin America           -9.0928905  0.9840717  -9.240  < 2e-16 ***
## nameLa Guardia                  -0.0685840  0.2361177  -0.290 0.771461    
## nameNewark Liberty Intl         -0.5742693  0.2620686  -2.191 0.028435 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 298.5093)
## 
##     Null deviance: 100837605  on 52123  degrees of freedom
## Residual deviance:  15553826  on 52105  degrees of freedom
## AIC: 444987
## 
## Number of Fisher Scoring iterations: 2
#test correlation
pred.step.t <- predict(glm.delay, newdata = test, type = "response")
test$predt<-pred.step.t

#correlation between actual and predicted
cor(test$predt,test$arr_delay)*100
## [1] 92.42794
#training correlation
pred.step.tr <- predict(glm.delay, type = "response")
training$predtr<-pred.step.tr

#correlation between actual and predicted
cor(training$predtr,training$arr_delay)*100
## [1] 91.96487
#Graph between actual and predicted values
g2<-ggplot(test, aes(
  x = arr_delay,
  y = predt
  )) +
  geom_point() +
  geom_line(aes(x=arr_delay,y=arr_delay))+
  xlab("Actual values") +
  ylab("Predicted values") +
  ggtitle("Actual vs predicted values") +
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))
ggplotly(g2)
#Mean squared error value
mean((test$predt-test$arr_delay)^2)
## [1] 300.4231
#diagnostic plots
plot(glm.delay)

#Testing the model 
#CrossValidation taking 7 folds
k<-7
nrow(flights.2013)
## [1] 74463
## [1] 77434
fold <- as.numeric(cut_number(1:nrow(flights.2013), k))

#Taking Sample Fold
fold <- sample(fold,length(fold))
fsize <- table(fold)

mse <- vector(length=k)

#Checking Error for every K folds
for (i in 1:k){
foldi <- flights.2013[fold==i,]
foldOther <- flights.2013[fold!=i,]
f <- lm(arr_delay ~ ., foldOther)
pred <- predict(f, foldi)
mse[i] <- mean((pred - foldi$arr_delay)^2) # MSEi
}

#Mean Error for the Model
mean(mse)
## [1] 299.2001